In [18]:
# Mount Google Drive so the notebook can read/write files under /content/drive
from google.colab import drive
drive.mount("/content/drive")
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
In [31]:
import pandas as pd
import numpy as np
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
In [20]:
# Locations of the raw train/test splits on the mounted Drive
train_csv = '/content/drive/My Drive/train_data.csv'
test_csv = '/content/drive/My Drive/test_data.csv'

# Load both splits: comma-separated, first row is the header
train_data, test_data = (
    pd.read_csv(path, sep=",", header=0)
    for path in (train_csv, test_csv)
)
In [21]:
# Quick schema check: column names, non-null counts and dtypes of the training set
train_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71656 entries, 0 to 71655
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   company_id   71656 non-null  int64 
 1   information  71656 non-null  object
 2   type         71656 non-null  object
 3   text         71656 non-null  object
 4   lower        71656 non-null  object
dtypes: int64(1), object(4)
memory usage: 2.7+ MB
In [22]:
# Same schema check for the test set
test_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   company_id   1000 non-null   int64 
 1   information  1000 non-null   object
 2   type         1000 non-null   object
 3   text         1000 non-null   object
 4   lower        1000 non-null   object
dtypes: int64(1), object(4)
memory usage: 39.2+ KB

data cleaning¶

In [23]:
import string
import nltk
# Download NLTK resources used below (no-op if already present in the runtime)
nltk.download('punkt')      # Punkt tokenizer models backing nltk.word_tokenize
nltk.download('stopwords')  # stopword lists, including 'english'
nltk.download('wordnet')    # WordNet lexicon backing WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!

Removing stopwords and punctuation, and lemmatizing the words¶

In [24]:
def cleaning_text(text):
    """Clean one raw text string for downstream vectorization.

    Steps: strip punctuation, lowercase, tokenize, drop stopwords and
    non-alphabetic tokens, lemmatize, and re-join into one string.

    Parameters
    ----------
    text : str
        Raw input text (here, one row of the "lower" column).

    Returns
    -------
    str
        Space-joined lemmatized tokens.
    """
    # Remove punctuation characters
    no_punctuation = ''.join([char for char in text if char not in string.punctuation])

    # Lowercase before tokenizing so stopword matching is case-insensitive
    no_punctuation_lower = no_punctuation.lower()

    # Tokenize the text into words
    words = nltk.word_tokenize(no_punctuation_lower)

    # PERF FIX: build the stopword set once per call. The original called
    # stopwords.words('english') inside the comprehension, re-reading the
    # corpus and doing an O(len(stopwords)) list scan for EVERY token.
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    # Keep only alphabetic, non-stopword tokens, lemmatized.
    # (Tokens are already lowercase here, so the original's word.lower()
    # was redundant — membership test is unchanged.)
    lemmatized_words = [
        lemmatizer.lemmatize(word)
        for word in words
        if word not in stop_words and word.isalpha()
    ]

    # Join the lemmatized words back into a single cleaned sentence
    return ' '.join(lemmatized_words)

Apply the cleaning function to the train and test text¶

In [25]:
# Apply the cleaning pipeline to the pre-lowercased text column of both splits
train_data["preprocessed_text"] = train_data["lower"].apply(cleaning_text)
test_data["preprocessed_text"] = test_data["lower"].apply(cleaning_text)
In [26]:
# Persist the cleaned splits back to Drive so later sessions can skip preprocessing
train_path = '/content/drive/My Drive/train_clean.csv'
test_path = '/content/drive/My Drive/test_clean.csv'
train_data.to_csv(train_path, index=False)
test_data.to_csv(test_path, index=False)

Some visualizations based on the cleaned data¶

In [30]:
def generate_word_cloud(text, title, ax):
    """Render a single-word word cloud for `text` onto the given axis.

    Parameters
    ----------
    text : str
        Space-separated corpus for one sentiment class.
    title : str
        Title drawn above the subplot.
    ax : matplotlib.axes.Axes
        Axis to draw into.
    """
    # Same rendering parameters as before, just built in one expression
    cloud = WordCloud(
        width=800,
        height=800,
        scale=10,
        max_words=100,
        max_font_size=100,
        background_color="black",
    )
    ax.imshow(cloud.generate(text), interpolation="bilinear")
    ax.set_title(title, fontsize=16)
    ax.axis("off")

# Create figure and axes for the 2x2 subplot grid
fig, axes = plt.subplots(2, 2, figsize=(20, 20))

# Generate and plot word clouds for each sentiment type
types = ['Positive', 'Negative', 'Irrelevant', 'Neutral']

for i, sentiment_type in enumerate(types):
    # BUG FIX: join with a space — ''.join fused the last word of one row
    # with the first word of the next, creating bogus tokens (e.g.
    # "murderim"). Also drop NaNs, matching the bigram cell below.
    word_cloud_text = ' '.join(
        train_data[train_data["type"] == sentiment_type].preprocessed_text.dropna()
    )
    ax = axes[i//2, i%2]  # Position in subplot grid
    generate_word_cloud(word_cloud_text, sentiment_type, ax)

# Display the plots
plt.tight_layout()
plt.show()
No description has been provided for this image

Next, I extend the single-word word cloud to n-grams, from bigrams up to quadrigrams.

In [41]:
def generate_bigram_word_cloud(text, title, ax):
    """Render an n-gram (2- to 4-gram) word cloud for `text` onto axis `ax`.

    Parameters
    ----------
    text : str
        Space-separated corpus for one sentiment class.
    title : str
        Sentiment label used in the subplot title.
    ax : matplotlib.axes.Axes
        Axis to draw into.
    """
    # ngram_range=(2, 4) extracts bigrams, trigrams AND quadrigrams —
    # the old comments/title calling this "bigram" were misleading.
    bigram_vectorizer = CountVectorizer(ngram_range=(2, 4))
    X_bigrams = bigram_vectorizer.fit_transform([text])

    # Get n-gram feature names and their corpus-wide frequencies
    bigram_frequencies = X_bigrams.sum(axis=0).tolist()[0]
    bigram_features = bigram_vectorizer.get_feature_names_out()

    # Map each n-gram to its count
    bigram_dict = dict(zip(bigram_features, bigram_frequencies))

    # Generate the word cloud from the frequency map
    bigram_wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(bigram_dict)

    # Plot the word cloud in the specified axes (ax)
    ax.imshow(bigram_wordcloud, interpolation='bilinear')
    ax.axis('off')
    # FIX: title previously said "Bigram" although 2-4-grams are shown
    ax.set_title(f'{title} 2-4-gram Word Cloud', fontsize=20)

# 2x2 grid of axes, one panel per sentiment class
fig, axes = plt.subplots(2, 2, figsize=(16, 20))

types = ['Positive', 'Negative', 'Irrelevant', 'Neutral']

# Pair each sentiment with its axis (row-major, same order as i//2, i%2)
for sentiment_type, ax in zip(types, axes.flat):
    # All cleaned text for this sentiment, as one space-separated string
    word_cloud_text = ' '.join(
        train_data.loc[train_data["type"] == sentiment_type, "preprocessed_text"].dropna()
    )
    # Draw the n-gram cloud into this panel
    generate_bigram_word_cloud(word_cloud_text, sentiment_type, ax)

# Tidy spacing and render
plt.tight_layout()
plt.show()
No description has been provided for this image
In [27]:
# Inspect the frame with the new preprocessed_text column (rich display)
train_data
Out[27]:
company_id information type text lower preprocessed_text
0 2401 Borderlands Positive im getting on borderlands and i will murder yo... im getting on borderlands and i will murder yo... im getting borderland murder
1 2401 Borderlands Positive I am coming to the borders and I will kill you... i am coming to the borders and i will kill you... coming border kill
2 2401 Borderlands Positive im getting on borderlands and i will kill you ... im getting on borderlands and i will kill you ... im getting borderland kill
3 2401 Borderlands Positive im coming on borderlands and i will murder you... im coming on borderlands and i will murder you... im coming borderland murder
4 2401 Borderlands Positive im getting on borderlands 2 and i will murder ... im getting on borderlands 2 and i will murder ... im getting borderland murder
... ... ... ... ... ... ...
71651 9200 Nvidia Positive Just realized that the Windows partition of my... just realized that the windows partition of my... realized window partition mac like year behind...
71652 9200 Nvidia Positive Just realized that my Mac window partition is ... just realized that my mac window partition is ... realized mac window partition year behind nvid...
71653 9200 Nvidia Positive Just realized the windows partition of my Mac ... just realized the windows partition of my mac ... realized window partition mac year behind nvid...
71654 9200 Nvidia Positive Just realized between the windows partition of... just realized between the windows partition of... realized window partition mac like year behind...
71655 9200 Nvidia Positive Just like the windows partition of my Mac is l... just like the windows partition of my mac is l... like window partition mac like year behind dri...

71656 rows × 6 columns

In [32]:
# Concatenate every cleaned document into one big string
all_text = ' '.join(train_data['preprocessed_text'].dropna())

# Tokenize on whitespace and keep the 20 most frequent tokens
word_counts = Counter(all_text.split())
common_words = word_counts.most_common(20)
top_words, top_counts = zip(*common_words)

# Bar chart of the top-20 tokens
plt.figure(figsize=(10, 6))
sns.barplot(x=list(top_words), y=list(top_counts))
plt.xticks(rotation=45)
plt.title('Top 20 Most Frequent Words', fontsize=16)
plt.ylabel('Frequency')
plt.show()
No description has been provided for this image
In [37]:
# N-gram visualization: ngram_range=(2, 4) extracts bigrams, trigrams
# AND quadrigrams (the old "# For bigrams" comment was wrong)
vectorizer = CountVectorizer(ngram_range=(2, 4))
X_bigrams = vectorizer.fit_transform(train_data['preprocessed_text'].dropna())
bigram_counts = X_bigrams.sum(axis=0).tolist()[0]
bigram_labels = vectorizer.get_feature_names_out()

# Pair each n-gram with its corpus-wide count and keep the top 20
bigram_count_pairs = list(zip(bigram_labels, bigram_counts))
top_20_bigrams = sorted(bigram_count_pairs, key=lambda x: x[1], reverse=True)[:20]

# Plot the n-gram frequencies
plt.figure(figsize=(10, 6))
sns.barplot(x=[bigram for bigram, count in top_20_bigrams], y=[count for bigram, count in top_20_bigrams])
plt.xticks(rotation=90)
# FIX: corrected the title typo "Quagrams" -> "Quadrigrams"
plt.title('Top 20 Bigrams, Trigrams and Quadrigrams in Preprocessed Text', fontsize=16)
plt.ylabel('Frequency')
plt.show()
No description has been provided for this image
  1. Users care about game redemption (redeem codes)
  2. "Italy" appears repeatedly — Italian users seem especially active in these gaming discussions
  3. Call of Duty is probably the hottest game

some visualization will be available when creating the models¶

embeddings¶

  • Count-based embeddings (like Bag of Words or CountVectorizer) are simple and effective for basic tasks but lack context.
  • TF-IDF improves by considering word importance.
  • Word2Vec, GloVe, and FastText capture semantic relationships between words and are dense representations.
  • BERT, RoBERTa, and other transformer-based models produce contextual embeddings, which are more powerful for tasks requiring an understanding of meaning in context.
  • LDA and LSA can extract topic-based embeddings.